In [45]:
import pandas as pd
import numpy as np

from sqlalchemy import create_engine
from dynaconf import Dynaconf

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNetCV, LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score, accuracy_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.inspection import PartialDependenceDisplay
from sklearn.feature_selection import SelectKBest, f_regression, RFE

import matplotlib.pyplot as plt
import seaborn as sns
import math
import shap

import joblib
In [2]:
#Load matchup_feature data
settings = Dynaconf(envvar_prefix='MYAPP', load_dotenv=True)
DB_URL = settings.DB_ENGINE_URL

engine = create_engine(DB_URL)

with engine.begin() as conn:
    matchup_df = pd.read_sql('SELECT * FROM matchup_features', conn)
In [4]:
#Clean/Explore
matchup_df.head()
Out[4]:
game_id season week elo_diff points_per_game_diff points_allowed_per_game_diff recent_points_per_game_diff recent_points_allowed_per_game_diff margin_of_victory_diff win_rate_diff ... yards_allowed_per_play_diff explosiveness_diff success_rate_diff travel_distance rest_days_diff recent_form_diff neutral_site vegas_spread_close vegas_over_under_close home_win
0 400603827 2015 1 28.86 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN 687.645 -9.223370e+18 NaN 1 -12.0 44.0 1
1 400603828 2015 1 29.71 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN 767.009 -9.223370e+18 NaN 0 -32.5 49.5 1
2 400603829 2015 1 19.42 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN 386.583 -9.223370e+18 NaN 1 -10.5 54.0 1
3 400603830 2015 1 20.00 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN 1456.180 -9.223370e+18 NaN 0 -34.0 54.0 1
4 400603831 2015 1 29.71 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN 512.644 -9.223370e+18 NaN 0 -35.0 57.0 1

5 rows × 21 columns

In [94]:
matchup_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7723 entries, 0 to 7722
Data columns (total 21 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   game_id                              7723 non-null   int64  
 1   season                               7723 non-null   int64  
 2   week                                 7723 non-null   int64  
 3   elo_diff                             7723 non-null   float64
 4   points_per_game_diff                 6598 non-null   float64
 5   points_allowed_per_game_diff         6598 non-null   float64
 6   recent_points_per_game_diff          6598 non-null   float64
 7   recent_points_allowed_per_game_diff  6598 non-null   float64
 8   margin_of_victory_diff               6598 non-null   float64
 9   win_rate_diff                        6598 non-null   float64
 10  yards_per_play_diff                  6497 non-null   float64
 11  yards_allowed_per_play_diff          6497 non-null   float64
 12  explosiveness_diff                   6497 non-null   float64
 13  success_rate_diff                    6497 non-null   float64
 14  travel_distance                      7723 non-null   float64
 15  rest_days_diff                       7723 non-null   float64
 16  recent_form_diff                     6598 non-null   float64
 17  neutral_site                         7723 non-null   int64  
 18  vegas_spread_close                   7692 non-null   float64
 19  vegas_over_under_close               7648 non-null   float64
 20  home_win                             7723 non-null   int64  
dtypes: float64(16), int64(5)
memory usage: 1.2 MB
In [5]:
matchup_df.describe()
Out[5]:
game_id season week elo_diff points_per_game_diff points_allowed_per_game_diff recent_points_per_game_diff recent_points_allowed_per_game_diff margin_of_victory_diff win_rate_diff ... yards_allowed_per_play_diff explosiveness_diff success_rate_diff travel_distance rest_days_diff recent_form_diff neutral_site vegas_spread_close vegas_over_under_close home_win
count 7.723000e+03 7723.000000 7723.000000 7723.000000 6598.000000 6598.000000 6598.000000 6598.000000 6598.000000 6598.000000 ... 6497.000000 6497.000000 6497.000000 7723.000000 7.723000e+03 6598.000000 7723.000000 7692.000000 7648.000000 7723.000000
mean 4.011980e+08 2019.667487 7.181536 9.373491 0.080466 -0.356659 -0.109427 -0.233859 0.437125 0.006678 ... -0.024634 -0.001210 0.001587 594.044639 -1.343514e+18 0.000682 0.078208 -4.275936 55.140167 0.578273
std 2.933903e+05 3.006470 4.122218 126.253496 11.720626 11.673811 13.694397 13.583830 18.221066 0.395722 ... 0.997241 0.191148 0.076994 538.927423 3.253993e+18 0.473894 0.268516 13.981723 8.214623 0.493867
min 4.006038e+08 2015.000000 1.000000 -519.580000 -66.000000 -63.000000 -66.000000 -63.000000 -86.000000 -1.000000 ... -4.541810 -1.311670 -0.331206 2.562420 -9.223370e+18 -1.000000 0.000000 -54.500000 27.500000 0.000000
25% 4.009375e+08 2017.000000 4.000000 -67.995000 -7.500000 -7.666670 -9.333330 -9.000000 -11.333300 -0.250000 ... -0.630453 -0.114764 -0.046310 244.819500 -5.184000e+14 -0.333333 0.000000 -13.500000 49.000000 0.000000
50% 4.012072e+08 2020.000000 7.000000 9.430000 0.000000 -0.250000 0.000000 -0.333333 0.166667 0.000000 ... -0.019434 0.000949 0.001684 448.965000 0.000000e+00 0.000000 0.000000 -3.500000 54.500000 1.000000
75% 4.014264e+08 2022.000000 11.000000 86.955000 7.714290 7.285710 9.000000 9.000000 11.833300 0.266667 ... 0.601632 0.113740 0.049644 770.691000 0.000000e+00 0.333333 0.000000 5.000000 60.500000 1.000000
max 4.017628e+08 2025.000000 16.000000 567.320000 59.000000 53.333300 59.000000 53.333300 90.000000 1.000000 ... 6.387010 1.099930 0.320819 5014.590000 1.088640e+16 1.000000 1.000000 54.000000 90.000000 1.000000

8 rows × 21 columns

In [6]:
#Missing Values
matchup_df.isnull().sum()
Out[6]:
game_id                                   0
season                                    0
week                                      0
elo_diff                                  0
points_per_game_diff                   1125
points_allowed_per_game_diff           1125
recent_points_per_game_diff            1125
recent_points_allowed_per_game_diff    1125
margin_of_victory_diff                 1125
win_rate_diff                          1125
yards_per_play_diff                    1226
yards_allowed_per_play_diff            1226
explosiveness_diff                     1226
success_rate_diff                      1226
travel_distance                           0
rest_days_diff                            0
recent_form_diff                       1125
neutral_site                              0
vegas_spread_close                       31
vegas_over_under_close                   75
home_win                                  0
dtype: int64
In [7]:
sns.heatmap(matchup_df.isnull(), cbar=False)
plt.title("Missing Values Heatmap")
plt.show()
No description has been provided for this image
In [8]:
#Outlier detection
numeric_cols = matchup_df.select_dtypes(include=[np.number]).columns
scaled = pd.DataFrame(StandardScaler().fit_transform(matchup_df[numeric_cols]), columns=numeric_cols)
scaled.boxplot(figsize=(15,6))
plt.xticks(rotation=90)
plt.title("Boxplot of Standardized Features")
plt.show()
No description has been provided for this image
In [99]:
numeric_cols = matchup_df.select_dtypes(include=[np.number]).columns
cols = 3  # Number of columns in the grid
rows = int(np.ceil(len(numeric_cols) / cols))

fig, axes = plt.subplots(rows, cols, figsize=(6 * cols, 4 * rows))
axes = axes.flatten()

for idx, col in enumerate(numeric_cols):
    # Highlight outliers for this column
    outlier_mask = np.abs(matchup_df[col]) > matchup_df[col].quantile(0.99)
    sns.scatterplot(
        x=matchup_df[col],
        y=matchup_df['elo_diff'],
        hue=outlier_mask,
        ax=axes[idx],
        palette={True: "red", False: "blue"},
        legend=False
    )
    axes[idx].set_title(f"{col} vs elo_diff")
    axes[idx].set_xlabel(col)
    axes[idx].set_ylabel("elo_diff")

# Hide any unused subplots
for idx in range(len(numeric_cols), len(axes)):
    axes[idx].set_visible(False)

fig.suptitle("Scatterplots of Numeric Features vs. Elo Diff (Outliers Highlighted)", fontsize=18, y=1.02)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [9]:
#Correlation analysis
corr = matchup_df.corr(numeric_only=True)
plt.figure(figsize=(16,12))  # Larger figure for clarity
sns.heatmap(
    corr,
    annot=True,
    cmap='coolwarm',
    annot_kws={"size": 10}  # Adjust annotation font size
)
plt.title("Correlation Heatmap of Numeric Features", fontsize=18)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.show()
No description has been provided for this image
In [11]:
#Target Variable Distribution
sns.histplot(matchup_df['home_win'].astype(float), bins=2)
plt.title('Target Distribution')
plt.xlabel('Home Loss (0)              Home Win (1)')
plt.show()
No description has been provided for this image
In [12]:
#Check how many games were played at neutral sites
print(matchup_df[matchup_df['neutral_site'] == 1].shape[0])
604
In [20]:
#Since those games will have erroneous travel distances, drop those games
matchup_df = matchup_df[matchup_df['neutral_site'] == 0]
matchup_df = matchup_df.drop(columns=['neutral_site'])
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File /opt/anaconda3/envs/collegeFootball/lib/python3.12/site-packages/pandas/core/indexes/base.py:3812, in Index.get_loc(self, key)
   3811 try:
-> 3812     return self._engine.get_loc(casted_key)
   3813 except KeyError as err:

File pandas/_libs/index.pyx:167, in pandas._libs.index.IndexEngine.get_loc()

File pandas/_libs/index.pyx:196, in pandas._libs.index.IndexEngine.get_loc()

File pandas/_libs/hashtable_class_helper.pxi:7088, in pandas._libs.hashtable.PyObjectHashTable.get_item()

File pandas/_libs/hashtable_class_helper.pxi:7096, in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'neutral_site'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
Cell In[20], line 2
      1 #Since those games will have erroneous travel distances, drop those games
----> 2 matchup_df = matchup_df[matchup_df['neutral_site'] == 0]
      3 matchup_df = matchup_df.drop(columns=['neutral_site'])

File /opt/anaconda3/envs/collegeFootball/lib/python3.12/site-packages/pandas/core/frame.py:4107, in DataFrame.__getitem__(self, key)
   4105 if self.columns.nlevels > 1:
   4106     return self._getitem_multilevel(key)
-> 4107 indexer = self.columns.get_loc(key)
   4108 if is_integer(indexer):
   4109     indexer = [indexer]

File /opt/anaconda3/envs/collegeFootball/lib/python3.12/site-packages/pandas/core/indexes/base.py:3819, in Index.get_loc(self, key)
   3814     if isinstance(casted_key, slice) or (
   3815         isinstance(casted_key, abc.Iterable)
   3816         and any(isinstance(x, slice) for x in casted_key)
   3817     ):
   3818         raise InvalidIndexError(key)
-> 3819     raise KeyError(key) from err
   3820 except TypeError:
   3821     # If we have a listlike key, _check_indexing_error will raise
   3822     #  InvalidIndexError. Otherwise we fall through and re-raise
   3823     #  the TypeError.
   3824     self._check_indexing_error(key)

KeyError: 'neutral_site'
In [21]:
#Drop rows with missing values
matchup_df = matchup_df.dropna()
In [22]:
#Define features and target
X = matchup_df.drop(columns=['home_win', 'game_id', 'season', 'week']).copy()
y = matchup_df['home_win'].astype(float).copy()
In [44]:
# Pairplot for selected features and target
pairplot_df = matchup_df.drop(columns=['game_id', 'season', 'week']).copy()
sns.pairplot(pairplot_df, hue='home_win', diag_kind='kde')
plt.suptitle('Pairplot of Key Features', y=1.02)
plt.show()
No description has been provided for this image
In [107]:
# Violin plots for feature distributions by target
num_features = len(X.columns)
cols = 3  # Number of columns in the grid
rows = math.ceil(num_features / cols)

fig, axes = plt.subplots(rows, cols, figsize=(6 * cols, 4 * rows))
axes = axes.flatten()

for idx, col in enumerate(X.columns):
    sns.violinplot(x=y, y=X[col], ax=axes[idx])
    axes[idx].set_title(f'Violin Plot of {col} by Home Win')

# Hide any unused subplots
for idx in range(num_features, len(axes)):
    axes[idx].set_visible(False)

plt.tight_layout()
plt.show()
No description has been provided for this image
In [108]:
# Feature-target scatter plots
num_features = len(X.columns)
cols = 3  # Number of columns in the grid
rows = math.ceil(num_features / cols)

fig, axes = plt.subplots(rows, cols, figsize=(6 * cols, 4 * rows))
axes = axes.flatten()

for idx, col in enumerate(X.columns):
    sns.scatterplot(x=X[col], y=y, ax=axes[idx])
    axes[idx].set_title(f'Scatter Plot: {col} vs Home Win')

# Hide any unused subplots
for idx in range(num_features, len(axes)):
    axes[idx].set_visible(False)

plt.tight_layout()
plt.show()
No description has been provided for this image
In [109]:
# PCA for dimensionality reduction visualization
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(StandardScaler().fit_transform(X))
plt.figure(figsize=(8,6))
sns.scatterplot(x=X_pca[:,0], y=X_pca[:,1], hue=y)
plt.title('PCA Projection of Features')
plt.show()
No description has been provided for this image
In [23]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [24]:
# Feature scaling for ElasticNet
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
In [25]:
# ElasticNetCV (cross-validated hyperparameter tuning)
elastic_cv = ElasticNetCV(
    l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
    alphas=np.logspace(-4, 2, 50),
    cv=5,
    random_state=42
)
elastic_cv.fit(X_train_scaled, y_train)
Out[25]:
ElasticNetCV(alphas=array([1.00000000e-04, 1.32571137e-04, 1.75751062e-04, 2.32995181e-04,
       3.08884360e-04, 4.09491506e-04, 5.42867544e-04, 7.19685673e-04,
       9.54095476e-04, 1.26485522e-03, 1.67683294e-03, 2.22299648e-03,
       2.94705170e-03, 3.90693994e-03, 5.17947468e-03, 6.86648845e-03,
       9.10298178e-03, 1.20679264e-02, 1.59985872e-02, 2.12095089e-02,
       2.81176870e-02, 3.727593...
       2.68269580e-01, 3.55648031e-01, 4.71486636e-01, 6.25055193e-01,
       8.28642773e-01, 1.09854114e+00, 1.45634848e+00, 1.93069773e+00,
       2.55954792e+00, 3.39322177e+00, 4.49843267e+00, 5.96362332e+00,
       7.90604321e+00, 1.04811313e+01, 1.38949549e+01, 1.84206997e+01,
       2.44205309e+01, 3.23745754e+01, 4.29193426e+01, 5.68986603e+01,
       7.54312006e+01, 1.00000000e+02]),
             cv=5, l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1],
             random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Parameters
l1_ratio  [0.1, 0.5, ...]
eps  0.001
n_alphas  'deprecated'
alphas  array([1.0000...00000000e+02])
fit_intercept  True
precompute  'auto'
max_iter  1000
tol  0.0001
cv  5
copy_X  True
verbose  0
n_jobs  None
positive  False
random_state  42
selection  'cyclic'
In [113]:
print("ElasticNet best alpha:", elastic_cv.alpha_)
print("ElasticNet best l1_ratio:", elastic_cv.l1_ratio_)
ElasticNet best alpha: 0.003906939937054617
ElasticNet best l1_ratio: 0.5
In [26]:
# ElasticNet predictions and evaluation
y_pred_enet = elastic_cv.predict(X_test_scaled)
print("ElasticNet RMSE:", root_mean_squared_error(y_test, y_pred_enet))
print("ElasticNet MAE:", mean_absolute_error(y_test, y_pred_enet))
print("ElasticNet R2:", r2_score(y_test, y_pred_enet))
ElasticNet RMSE: 0.41063762199089504
ElasticNet MAE: 0.3564100410227748
ElasticNet R2: 0.28469829916101685
In [27]:
# Feature importances (coefficients)
coef_df = pd.Series(elastic_cv.coef_, index=X.columns)
coef_df.sort_values(ascending=False).plot(kind='bar', figsize=(12,4), title='ElasticNet Coefficients')
plt.show()
No description has been provided for this image
In [47]:
# Gradient Boosting hyperparameter tuning with GridSearchCV
gbc = GradientBoostingClassifier(random_state=42)
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0]
}
grid_search = GridSearchCV(gbc, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train, y_train)

print("GB best parameters:", grid_search.best_params_)
GB best parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}
In [50]:
# Gradient Boosting hyperparameter tuning with GridSearchCV
gbc = GradientBoostingClassifier(random_state=42)
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0]
}
gb_grid_search = GridSearchCV(gbc, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
gb_grid_search.fit(X_train, y_train)

print("GB best parameters:", gb_grid_search.best_params_)

gbc_best = gb_grid_search.best_estimator_
y_pred_gbc = gbc_best.predict(X_test)
y_proba_gbc = gbc_best.predict_proba(X_test)[:, 1]

print("GB Classifier Accuracy:", accuracy_score(y_test, y_pred_gbc))
print("GB Classifier ROC AUC:", roc_auc_score(y_test, y_proba_gbc))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_gbc))
python(47003) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(47004) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(47005) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(47006) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(47007) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(47008) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(47009) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(47010) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
GB best parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}
GB Classifier Accuracy: 0.7635350318471338
GB Classifier ROC AUC: 0.8333109249120694
Confusion Matrix:
 [[320 158]
 [139 639]]
In [ ]:
print("Classification report: \n": classification_report(y_test, y_pred_gbc))
In [30]:
# Feature importances (Gradient Boosting)
feat_imp = pd.Series(gb_best.feature_importances_, index=X.columns)
feat_imp.sort_values(ascending=False).plot(kind='bar', figsize=(12,4), title='GB Feature Importances')
plt.show()
No description has been provided for this image
In [37]:
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],
    'max_iter': [1000]
}

# Grid search with cross-validation
grid_search_logreg = GridSearchCV(
    LogisticRegression(random_state=42),
    param_grid,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1
)
grid_search_logreg.fit(X_train, y_train)

# Best model and predictions
best_logreg = grid_search_logreg.best_estimator_
y_pred_logreg = best_logreg.predict(X_test)
y_proba_logreg = best_logreg.predict_proba(X_test)[:, 1]

print("Best Logistic Regression parameters:", grid_search_logreg.best_params_)
print("Tuned Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_logreg))
print("Tuned Logistic Regression ROC AUC:", roc_auc_score(y_test, y_proba_logreg))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_logreg))
print("Classification Report:\n", classification_report(y_test, y_pred_logreg))
Best Logistic Regression parameters: {'C': 0.1, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'liblinear'}
Tuned Logistic Regression Accuracy: 0.7635350318471338
Tuned Logistic Regression ROC AUC: 0.8290784223037291
Confusion Matrix:
 [[335 143]
 [154 624]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.69      0.70      0.69       478
         1.0       0.81      0.80      0.81       778

    accuracy                           0.76      1256
   macro avg       0.75      0.75      0.75      1256
weighted avg       0.76      0.76      0.76      1256

In [51]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'max_features': [None, 'sqrt', 'log2']
}

grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1
)
grid_search.fit(X_train, y_train)

rf = grid_search.best_estimator_  # Store the best model as 'rf'
print("Best parameters:", grid_search.best_params_)

y_pred_rf = rf.predict(X_test)
y_proba_rf = rf.predict_proba(X_test)[:, 1]

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Random Forest ROC AUC:", roc_auc_score(y_test, y_proba_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))
Best parameters: {'max_depth': 5, 'max_features': None, 'min_samples_split': 2, 'n_estimators': 300}
Random Forest Accuracy: 0.7691082802547771
Random Forest ROC AUC: 0.8320363339105743
Confusion Matrix:
 [[326 152]
 [138 640]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.70      0.68      0.69       478
         1.0       0.81      0.82      0.82       778

    accuracy                           0.77      1256
   macro avg       0.76      0.75      0.75      1256
weighted avg       0.77      0.77      0.77      1256

In [52]:
# Residual plots after model fitting
plt.figure(figsize=(6,3))
sns.histplot(y_test - y_pred_rf, bins=30, kde=True)
plt.title('Random Forest Residuals')
plt.show()
No description has been provided for this image
In [ ]:
# Residual plots after model fitting
plt.figure(figsize=(6,3))
sns.histplot(y_test - y_pred_enet, bins=30, kde=True)
plt.title('ElasticNet Residuals')
plt.show()
No description has been provided for this image
In [48]:
plt.figure(figsize=(6,3))
sns.histplot(y_test - y_pred_gbc, bins=30, kde=True)
plt.title('Gradient Boosting Residuals')
plt.show()
No description has been provided for this image
In [ ]:
 
No description has been provided for this image
In [40]:
# Residuals for Logistic Regression (difference between true label and predicted probability)
residuals_logreg = y_test - y_proba_logreg

plt.figure(figsize=(6,3))
sns.histplot(residuals_logreg, bins=30, kde=True)
plt.title('Logistic Regression Residuals')
plt.xlabel('Residual (True Label - Predicted Probability)')
plt.show()
No description has been provided for this image
In [ ]:
explainer = shap.Explainer(gb_best, X_test)
shap_values = explainer(X_test)
shap.summary_plot(shap_values, X_test)
No description has been provided for this image
In [42]:
# Feature importances (Logistic Regression coefficients from tuned model)
coef_logreg_tuned = pd.Series(best_logreg.coef_[0], index=X.columns)
coef_logreg_tuned.sort_values(ascending=False).plot(kind='bar', figsize=(12,4), title='Logistic Regression Coefficients')
plt.show()
No description has been provided for this image
In [ ]:
# Feature importances (Random Forest)
feat_imp_rf = pd.Series(rf.feature_importances_, index=X.columns)
feat_imp_rf.sort_values(ascending=False).plot(kind='bar', figsize=(12,4), title='Random Forest Feature Importances')
plt.show()
No description has been provided for this image
In [ ]:
# Vegas spread rule-based prediction
vegas_pred = (X_test['vegas_spread_close'] < 0).astype(float)

print("Vegas Spread Accuracy:", accuracy_score(y_test, vegas_pred))
print("Vegas Spread ROC AUC:", roc_auc_score(y_test, vegas_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, vegas_pred))
print("Classification Report:\n", classification_report(y_test, vegas_pred))
Vegas Spread Accuracy: 0.7512155591572123
Vegas Spread ROC AUC: 0.737463392779089
Confusion Matrix:
 [[331 165]
 [142 596]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.70      0.67      0.68       496
         1.0       0.78      0.81      0.80       738

    accuracy                           0.75      1234
   macro avg       0.74      0.74      0.74      1234
weighted avg       0.75      0.75      0.75      1234

Model Comparison: Random Forest (Tuned) vs. Logistic Regression vs. Gradient Boosting vs. ElasticNet vs. Vegas Spread Rule¶

Random Forest (Tuned)

  • Best Parameters:
    • n_estimators: 300
    • max_depth: 5
    • min_samples_split: 5
    • max_features: 'sqrt'
  • Accuracy: 0.76
  • ROC AUC: 0.83
  • F1-score (weighted): 0.76

Logistic Regression (Tuned)

  • Best Parameters:
    • C: 0.1
    • penalty: 'l1'
    • solver: 'liblinear'
    • max_iter: 1000
  • Accuracy: 0.76
  • ROC AUC: 0.83
  • F1-score (weighted): 0.76

Gradient Boosting (Tuned)

  • Best Parameters:
    • n_estimators: 200
    • max_depth: 5
    • learning_rate: 0.1
    • subsample: 1.0
  • Accuracy: 0.75
  • ROC AUC: 0.81
  • F1-score (weighted): 0.75

ElasticNet Regression (Tuned)

  • Best Parameters:
    • alpha: 0.015998587196060572
    • l1_ratio: 0.1
  • RMSE: 0.4090
  • MAE: 0.3574
  • R2: 0.3040

Vegas Spread Rule

  • Accuracy: 0.75
  • ROC AUC: 0.74
  • F1-score (weighted): 0.75

Interpretation:
The tuned Random Forest model outperforms the other models and the Vegas spread rule on all major metrics (accuracy, ROC AUC, F1-score).
Gradient Boosting, Logistic Regression, and ElasticNet are competitive, but Random Forest provides the best overall performance, especially in distinguishing wins and losses (higher ROC AUC).

Summary:
The Random Forest model, after hyperparameter tuning, is the top performer and meets the project goal.

In [38]:
# Save model
#joblib.dump(rf, 'random_forest_model.pkl')
In [39]:
# For one or more features, e.g. 'elo_diff' and 'margin_of_victory_diff'
features_to_plot = ['elo_diff', 'margin_of_victory_diff', 'vegas_spread_close']

PartialDependenceDisplay.from_estimator(
    rf,                
    X_test,            
    features_to_plot,  
    kind='average',    
    grid_resolution=50,
)
plt.suptitle('Partial Dependence Plots (Random Forest)')
plt.show()
No description has been provided for this image
In [ ]: